//******************************************************************************
//
// File:    PrimeCountFunctionHyb.java
// Package: benchmarks.detinfer.pj.edu.rithyb.prime
// Unit:    Class benchmarks.detinfer.pj.edu.rithyb.prime.PrimeCountFunctionHyb
//
// This Java source file is copyright (C) 2008 by Alan Kaminsky. All rights
// reserved. For further information, contact the author, Alan Kaminsky, at
// ark@cs.rit.edu.
//
// This Java source file is part of the Parallel Java Library ("PJ"). PJ is free
// software; you can redistribute it and/or modify it under the terms of the GNU
// General Public License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// PJ is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
// A PARTICULAR PURPOSE. See the GNU General Public License for more details.
//
// A copy of the GNU General Public License is provided in the file gpl.txt. You
// may also obtain a copy of the GNU General Public License on the World Wide
// Web at http://www.gnu.org/licenses/gpl.html.
//
//******************************************************************************

package benchmarks.detinfer.pj.edu.rithyb.prime;

import benchmarks.detinfer.pj.edu.ritmp.IntegerBuf;
import benchmarks.detinfer.pj.edu.ritmp.LongBuf;
import benchmarks.detinfer.pj.edu.ritmp.ObjectBuf;

import benchmarks.detinfer.pj.edu.ritmp.buf.LongItemBuf;
import benchmarks.detinfer.pj.edu.ritmp.buf.ObjectItemBuf;

import benchmarks.detinfer.pj.edu.ritpj.Comm;
import benchmarks.detinfer.pj.edu.ritpj.CommStatus;
import benchmarks.detinfer.pj.edu.ritpj.LongForLoop;
import benchmarks.detinfer.pj.edu.ritpj.LongSchedule;
import benchmarks.detinfer.pj.edu.ritpj.ParallelRegion;
import benchmarks.detinfer.pj.edu.ritpj.ParallelSection;
import benchmarks.detinfer.pj.edu.ritpj.ParallelTeam;

import benchmarks.detinfer.pj.edu.ritpj.reduction.LongOp;
import benchmarks.detinfer.pj.edu.ritpj.reduction.SharedLong;

import benchmarks.detinfer.pj.edu.ritutil.LongRange;

import java.io.File;

/**
 * Class PrimeCountFunctionHyb is a hybrid parallel program that calculates the
 * prime counting function &pi;(<I>x</I>). &pi;(<I>x</I>) is the number of
 * primes less than or equal to <I>x</I>. The program uses a list of 32-bit
 * primes stored in a file. The prime file must be generated by the {@linkplain
 * Prime32File} program. To find the primes, the program calculates a series of
 * sieves. Each sieve consists of one million numbers.
 * <P>
 * The program runs with one process per node and multiple threads per process.
 * The program uses the master-worker pattern for load balancing. Each process
 * in the program is an independent worker process. Each worker process
 * calculates a series of groups of sieves, as assigned by the master thread.
 * Within each group of sieves, the threads of the worker process calculate the
 * individual sieves in parallel.
 * <P>
 * The groups of sieves are determined by the <TT>pj.schedule</TT> property
 * specified on the command line; the default is to divide the sieves evenly
 * among the worker processes (i.e. no load balancing). For further information
 * about the <TT>pj.schedule</TT> property, see class {@linkplain
 * benchmarks.detinfer.pj.edu.ritpj.PJProperties PJProperties}.
 * <P>
 * Within each group of sieves, the individual sieves are partitioned among the
 * threads of the worker process using the parallel loop schedule specified by
 * the last command line argument. If this argument is missing, the default is
 * to divide the individual sieves evenly among the threads (i.e. no load
 * balancing). For further information, see the <TT>parse()</TT> method in class
 * {@linkplain benchmarks.detinfer.pj.edu.ritpj.IntegerSchedule IntegerSchedule}.
 * <P>
 * Usage: java -Dpj.np=<I>Kp</I> -Dpj.nt=<I>Kt</I> [
 * -Dpj.schedule=<I>procschedule</I> ] benchmarks.detinfer.pj.edu.rithyb.prime.PrimeCountFunctionHyb
 * <I>x</I> <I>primefile</I> [ <I>thrschedule</I> ]
 * <BR><I>Kp</I> = Number of parallel processes
 * <BR><I>Kt</I> = Number of parallel threads per process
 * <BR><I>procschedule</I> = Load balancing schedule for processes
 * <BR><I>x</I> = Argument of prime counting function, 0 &lt;= <I>x</I> &lt;=
 * 2<SUP>63</SUP>-1
 * <BR><I>primefile</I> = Prime file name
 * <BR><I>thrschedule</I> = Load balancing schedule for threads
 * <P>
 * The computation is performed in parallel in multiple processors. The program
 * measures the total running time.
 *
 * @author  Alan Kaminsky
 * @version 05-Jun-2008
 */
public class PrimeCountFunctionHyb
	{

// Prevent construction.

	private PrimeCountFunctionHyb()
		{
		}

// Shared global variables.

	// Sieve in one-million-number chunks.
	static final int CHUNK = 1000000;

	// World communicator.
	static Comm world;
	static int size;
	static int rank;

	// Command line arguments.
	static long x;
	static File primefile;
	static LongSchedule thrschedule;

	// Parallel team.
	static ParallelTeam team;

	// Per-thread sieves.
	static Sieve[] sieves;

	// List of 32-bit primes.
	static Prime32List primeList;

	// For counting primes.
	static SharedLong primeCount = new SharedLong (0);

// Main program.

	/**
	 * Main program.
	 */
	public static void main
		(String[] args)
		throws Exception
		{
		// Start timing.
		long t1 = System.currentTimeMillis();

		// World communicator.
		Comm.init (args);
		world = Comm.world();
		size = world.size();
		rank = world.rank();

		// Parse command line arguments.
		if (args.length < 2 || args.length > 3) usage();
		x = Long.parseLong (args[0]);
		if (x < 0) usage();
		primefile = new File (args[1]);
		thrschedule =
			args.length == 3 ?
				LongSchedule.parse (args[2]) :
				LongSchedule.fixed();

		// Set up parallel team and per-thread sieves.
		team = new ParallelTeam();
		sieves = new Sieve [team.getThreadCount()];
		for (int i = 0; i < sieves.length; ++ i)
			{
			sieves[i] = new Sieve (0, CHUNK);
			}

		// Set up list of 32-bit primes.
		primeList = new Prime32List (primefile);

		// In master process, run master section and worker section in parallel.
		if (rank == 0)
			{
			new ParallelTeam(2).execute (new ParallelRegion()
				{
				public void run() throws Exception
					{
					execute (new ParallelSection()
						{
						public void run() throws Exception
							{
							masterSection();
							}
						},
					new ParallelSection()
						{
						public void run() throws Exception
							{
							workerSection();
							}
						});
					}
				});
			}

		// In worker process, run only worker section.
		else
			{
			workerSection();
			}

		// Reduce prime counts into process 0.
		LongItemBuf buf = LongBuf.buffer (primeCount.longValue());
		world.reduce (0, buf, LongOp.SUM);

		// Stop timing.
		long t2 = System.currentTimeMillis();

		// Print the answer. (Add 1 because 2 is a prime.)
		if (rank == 0)
			{
			System.out.println ("pi("+x+") = "+(buf.item+1));
			}
		System.out.println ((t2-t1)+" msec "+rank);
		}

// Hidden operations.

	/**
	 * Perform the master section.
	 *
	 * @exception  Exception
	 *     Thrown if an I/O error occurred.
	 */
	private static void masterSection()
		throws Exception
		{
		int worker;
		LongRange range;

		// Determine number of sieves to calculate.
		long ns = (x + CHUNK - 1) / CHUNK;

		// Set up a schedule object.
		LongSchedule schedule = LongSchedule.runtime();
		schedule.start (size, new LongRange (0, ns-1));

		// Send initial sieve range to each worker. If range is null, no more
		// work for that worker. Keep count of active workers.
		int activeWorkers = size;
		for (worker = 0; worker < size; ++ worker)
			{
			range = schedule.next (worker);
			world.send (worker, ObjectBuf.buffer (range));
			if (range == null) -- activeWorkers;
			}

		// Repeat until all workers have finished.
		while (activeWorkers > 0)
			{
			// Receive an empty message from any worker.
			CommStatus status = world.receive (null, IntegerBuf.emptyBuffer());
			worker = status.fromRank;

			// Send next chunk range to that specific worker. If null, no more
			// work.
			range = schedule.next (worker);
			world.send (worker, ObjectBuf.buffer (range));
			if (range == null) -- activeWorkers;
			}
		}

	/**
	 * Perform the worker section.
	 *
	 * @exception  Exception
	 *     Thrown if an I/O error occurred.
	 */
	private static void workerSection()
		throws Exception
		{
		// Process chunks from master.
		for (;;)
			{
			// Receive sieve range from master. If null, no more work.
			ObjectItemBuf<LongRange> rangeBuf = ObjectBuf.buffer();
			world.receive (0, rangeBuf);
			LongRange range = rangeBuf.item;
			if (range == null) break;
			final long lb = range.lb();
			final long ub = range.ub();

			// Calculate sieves in parallel threads.
			team.execute (new ParallelRegion()
				{
				public void run() throws Exception
					{
					execute (lb, ub, new LongForLoop()
						{
						// Per-thread variables plus extra padding.
						Sieve thrSieve;
						long thrPrimeCount;
						long p0, p1, p2, p3, p4, p5, p6, p7;
						long p8, p9, pa, pb, pc, pd, pe, pf;

						// Use the thread-level loop schedule.
						public LongSchedule schedule()
							{
							return thrschedule;
							}

						// Initialize per-thread variables.
						public void start()
							{
							thrSieve = sieves[getThreadIndex()];
							thrPrimeCount = 0;
							}

						// Calculate all sieves.
						public void run (long first, long last) throws Exception
							{
							for (long lb = first; lb <= last; ++ lb)
								{
								// Get an iterator for the odd primes.
								LongIterator iter = primeList.iterator();

								// Calculate the sieve.
								thrSieve.lb (lb*CHUNK);
								thrSieve.initialize();
								thrSieve.sieveOut (iter);

								// Count primes <= x left in the sieve.
								iter = thrSieve.iterator();
								long p;
								while ((p = iter.next()) != 0 && p <= x)
									{
									++ thrPrimeCount;
									}
								}
							}

						// Reduce per-thread prime count into global prime
						// count.
						public void finish()
							{
							primeCount.addAndGet (thrPrimeCount);
							}
						});
					}
				});

			// Report completion of sieve range to master.
			world.send (0, IntegerBuf.emptyBuffer());
			}
		};

	/**
	 * Print a usage message and exit.
	 */
	private static void usage()
		{
		System.err.println ("Usage: java -Dpj.np=<Kp> -Dpj.nt=<Kt> [ -Dpj.schedule=<procschedule> ] benchmarks.detinfer.pj.edu.rithyb.prime.PrimeCountFunctionHyb <x> <primefile> [ <thrschedule> ]");
		System.err.println ("<Kp> = Number of parallel processes");
		System.err.println ("<Kt> = Number of parallel threads per process");
		System.err.println ("<procschedule> = Load balancing schedule for processes");
		System.err.println ("<x> = Argument of prime counting function, 0 <= <x> <= 2^63-1");
		System.err.println ("<primefile> = Prime file name");
		System.err.println ("<thrschedule> = Load balancing schedule for threads System.exit (1);");
		}

	}
